Show the code
#|
# Total number of works in OpenAlex: an empty search string matches
# everything, count_only avoids downloading records. Cached in oa_count$oa.
if (get_count) {
oa_count$oa <- openalexR::oa_fetch(
entity = "works",
search = "",
output = "list",
count_only = TRUE,
verbose = TRUE
)$count
}Data Management Report
A short description of what this is about. This is not a traditional abstract, but rather something else …
IPBES_TCA_Ch2_technology
%The BuildNo is automatically increased by one each time the report is rendered. It is used to indicate different renderings when the version stays the same%.
All searches are done on all works in OpenAlex. The search in the TCA Corpus is not possible at the moment, but we are working on it.
The search terms are based on the shared google doc. They are cleaned up for the usage in OpenAlex.
#|
# NOTE(review): identical to the first chunk above — it fetches the total
# OpenAlex works count into oa_count$oa again. Possibly a copy-paste
# duplicate; harmless but redundant.
if (get_count) {
oa_count$oa <- openalexR::oa_fetch(
entity = "works",
search = "",
output = "list",
count_only = TRUE,
verbose = TRUE
)$count
}#|
if (get_count) {
# Per-year counts of all OpenAlex works, with proportion (p) and
# cumulative proportion (p_cum) per publication year.
oa_count$oa_years <- openalexR::oa_fetch(
group_by = "publication_year",
output = "dataframe",
verbose = TRUE
) |>
dplyr::mutate(
# the grouped key arrives as text; convert to an integer year
publication_year = as.integer(as.character(key_display_name)),
key = NULL,
key_display_name = NULL,
p = count / sum(count)
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
p_cum = cumsum(p)
) |>
dplyr::select(
publication_year,
everything()
)
}The search terms is vision Open Alex search.
#|
# Count of works matching the "vision" search terms (title + abstract).
if (get_count) {
oa_count$vision <- openalexR::oa_fetch(
title_and_abstract.search = params$st_vision,
count_only = TRUE,
output = "list",
verbose = TRUE
)$count
}#|
if (get_count) {
# Per-year counts for the "vision" search, same shape as oa_years:
# publication_year, count, p (share per year), p_cum (cumulative share).
oa_count$vision_years <- openalexR::oa_fetch(
title_and_abstract.search = params$st_vision,
group_by = "publication_year",
output = "dataframe",
verbose = TRUE
) |>
dplyr::mutate(
publication_year = as.integer(as.character(key_display_name)),
key = NULL,
key_display_name = NULL,
p = count / sum(count)
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
p_cum = cumsum(p)
) |>
dplyr::select(
publication_year,
everything()
)
}The search terms is technology Open Alex search.
#|
# Count of works matching the "technology" search terms.
# compact() drops empty elements from the search-term parameter.
if (get_count) {
oa_count$technology <- openalexR::oa_fetch(
title_and_abstract.search = compact(params$st_technology),
count_only = TRUE,
output = "list",
verbose = TRUE
)$count
}#|
if (get_count) {
# Per-year counts for the "technology" search (same shape as oa_years).
oa_count$technology_years <- openalexR::oa_fetch(
title_and_abstract.search = compact(params$st_technology),
group_by = "publication_year",
output = "dataframe",
verbose = TRUE
) |>
dplyr::mutate(
publication_year = as.integer(as.character(key_display_name)),
key = NULL,
key_display_name = NULL,
p = count / sum(count)
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
p_cum = cumsum(p)
) |>
dplyr::select(
publication_year,
everything()
)
}Open Alex search.
The search term is vision AND technology
#|
# Count of works matching vision AND technology (both term sets combined
# into one boolean title/abstract query).
if (get_count) {
oa_count$vision_technology <- openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
output = "list",
count_only = TRUE,
verbose = TRUE
)$count
}#|
if (get_count) {
# Per-year counts for vision AND technology (same shape as oa_years).
oa_count$vision_technology_years <- openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
group_by = "publication_year",
output = "dataframe",
verbose = TRUE
) |>
dplyr::mutate(
publication_year = as.integer(as.character(key_display_name)),
key = NULL,
key_display_name = NULL,
p = count / sum(count)
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
p_cum = cumsum(p)
) |>
dplyr::select(
publication_year,
everything()
)
}#|
if (get_count) {
# Counts per primary-topic subfield for the vision AND technology corpus.
oa_count$vision_technology_subfields <- openalexR::oa_query(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
group_by = "primary_topic.subfield.id",
verbose = TRUE
) |>
openalexR::oa_request() |>
dplyr::bind_rows() |>
dplyr::arrange(key)
## clean up missing or wrong vision_technology_subfields$key_display_name
# A display name "needs cleaning" when it is NA or purely numeric.
# NOTE(review): as.numeric() on non-numeric strings emits coercion
# warnings here; harmless but noisy.
need_cleaning <- is.na(oa_count$vision_technology_subfields$key_display_name) |
!is.na(as.numeric(oa_count$vision_technology_subfields$key_display_name))
fine <- !need_cleaning
# Re-attach correct display names (taken from the clean rows) to the rows
# that were missing/wrong, then re-aggregate the counts per subfield.
oa_count$vision_technology_subfields <- oa_count$vision_technology_subfields |>
dplyr::filter(fine) |>
dplyr::select(key, key_display_name) |>
dplyr::distinct() |>
merge(y = oa_count$vision_technology_subfields[need_cleaning, -2], by = "key") |>
dplyr::bind_rows(oa_count$vision_technology_subfields[fine, ]) |>
dplyr::group_by(key, key_display_name) |>
dplyr::summarize(count = sum(count))
}Open Alex search.
The search term is vision AND technology
#|
# Count of works matching vision AND technology AND marine.
if (get_count) {
oa_count$vision_technology_marine <- openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_marine, ")")),
output = "list",
count_only = TRUE,
verbose = TRUE
)$count
}#|
if (get_count) {
# Per-year counts for vision AND technology AND marine
# (same shape as oa_years).
oa_count$vision_technology_marine_years <- openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_marine, ")")),
group_by = "publication_year",
output = "dataframe",
verbose = TRUE
) |>
dplyr::mutate(
publication_year = as.integer(as.character(key_display_name)),
key = NULL,
key_display_name = NULL,
p = count / sum(count)
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
p_cum = cumsum(p)
) |>
dplyr::select(
publication_year,
everything()
)
}#|
if (get_count) {
  # Unique OpenAlex IDs of all works matching vision AND technology AND
  # marine. Used further below to subset the sentiment results.
  # BUG FIX: the original call had a trailing comma after
  # `output = "list",`, which R rejects as an empty argument.
  # vapply() (instead of sapply()) guarantees a character vector even for
  # zero or one result.
  oa_count$vision_technology_marine_ids <- openalexR::oa_fetch(
    title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_marine, ")")),
    verbose = TRUE,
    output = "list"
  ) |>
    vapply(
      FUN = function(x) {
        x$id
      },
      FUN.VALUE = character(1)
    ) |>
    unique()
}Open Alex search.
The search term is vision AND technology
# Too long search string
if (get_count) {
# Count of works matching vision AND technology AND case-study terms.
oa_count$vision_technology_case <- openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_case, ")")),
output = "list",
count_only = TRUE,
verbose = TRUE
)$count
}#|
if (get_count) {
# Per-year counts for vision AND technology AND case
# (same shape as oa_years).
oa_count$vision_technology_case_years <- openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_case, ")")),
group_by = "publication_year",
output = "dataframe",
verbose = TRUE
) |>
dplyr::mutate(
publication_year = as.integer(as.character(key_display_name)),
key = NULL,
key_display_name = NULL,
p = count / sum(count)
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
p_cum = cumsum(p)
) |>
dplyr::select(
publication_year,
everything()
)
# Persist all collected counts so later chunks (and re-renders) can reuse
# them without hitting the OpenAlex API again.
}if (get_count) {
saveRDS(oa_count, params$fn_count)
}technology AND vision Corpus from OpenAlexThe corpus download will be stored in ch2_technology/pages and the arrow database in data/ch2_technology/corpus_complete. This one will be filtered with the TCA / G;obal Corpus and get the final name data/ch2_technology/corpus.
This is not on github!
The corpus can be read by running corpus_read("data/ch2_technology/corpus") which opens the database so that then it can be fed into a dplyr pipeline. After most dplyr functions, the actual data needs to be collected via collect().
Only then is the actual data read!
Needs to be enabled by setting eval: true in the code block below.
#|
# Download the vision AND technology corpus page-wise from OpenAlex.
# NOTE(review): dry_run = TRUE means this chunk as written does NOT
# actually download — confirm before relying on the pages dir.
tic()
IPBES.R::corpus_download(
pages_dir = file.path(".", "data", "ch2_technology", "pages"),
title_and_abstract_search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
continue = TRUE,
delete_pages_dir = FALSE,
set_size = 2000,
dry_run = TRUE,
verbose = TRUE,
mc_cores = 6
)
toc()The fields author and topics are serialized in the arrow database and need to be unserialized by using unserialize_arrow() on a dataset containing the two columns.
tic()
# Convert the downloaded .rds pages into the (complete) arrow database.
IPBES.R::corpus_pages_to_arrow(
pages_dir = params$pages_dir,
arrow_dir = params$corpus_complete_dir,
continue = TRUE,
delete_arrow_dir = FALSE,
dry_run = FALSE,
verbose = TRUE,
mc_cores = 3
)
toc()#|
tic()
# Build the list of Technology-corpus IDs that are also in the TCA corpus,
# cache it, and filter the complete corpus down to those works.
if (!file.exists(params$fn_ids_tech_in_tca)) {
  ids_technology <- IPBES.R::corpus_read(params$corpus_complete) |>
    dplyr::select(id) |>
    collect() |>
    unlist()
  ids_tca <- read_corpus(file.path("..", "IPBES_TCA_Corpus", "data", "tca_corpus", "corpus")) |>
    dplyr::select(id) |>
    collect() |>
    unlist()
  # BUG FIX: this was assigned to `fn_ids_tech_in_tca`, so the saveRDS()
  # and corpus_filter_ids() calls below referenced an undefined object
  # `ids_tech_in_tca` and the chunk failed at run time.
  ids_tech_in_tca <- ids_technology[ids_technology %in% ids_tca]
  rm(ids_technology, ids_tca)
  saveRDS(ids_tech_in_tca, params$fn_ids_tech_in_tca)
  # NOTE(review): corpus_read() above uses params$corpus_complete while
  # corpus_filter_ids() uses params$corpus_complete_dir — confirm both
  # params point at the same arrow directory.
  IPBES.R::corpus_filter_ids(
    arrow_dir = params$corpus_complete_dir,
    arrow_filter_dir = params$corpus_dir,
    filter_ids = ids_tech_in_tca
  )
  rm(ids_tech_in_tca)
}
toc()Check the number of duplicates before running this next block, and then verify the new corpus afterwards. RUN ONLY MANUALLY!
#|
ONLY RUN MANUALLY!!!!!!!!!!!!!!!!!!!!!!!
# Fraction of IDs that occur more than once in the corpus (before dedup).
(read_corpus(params$corpus_dir) |> group_by(id) |> summarize(n = n()) |> filter(n > 1) |> collect() |> nrow()) / (corpus_read(params$corpus_dir) |> nrow())
# All publication years present in the corpus, sorted.
years <- IPBES.R::corpus_read(params$corpus_dir) |>
distinct(publication_year) |>
collect() |>
unlist() |>
as.vector() |>
sort()
# Deduplicate year by year: keep exactly one row per id
# (slice_max with with_ties = FALSE) and write to a "_deduplicated" copy.
lapply(
years,
function(y) {
message("\nProcessing year: ", y)
tic()
dataset <- IPBES.R::corpus_read(params$corpus_dir) |>
dplyr::filter(publication_year == y) |>
dplyr::collect() |>
group_by(id) |>
slice_max(
publication_year,
n = 1,
with_ties = FALSE,
na_rm = TRUE
)
# unlink(
# file.path(params$corpus_dir, paste0("publication_year=", y)),
# recursive = TRUE,
# force = TRUE
# )
arrow::write_dataset(
dataset = dataset,
path = paste0(params$corpus_dir, "_deduplicated"),
partitioning = c("publication_year", "set"),
format = "parquet",
existing_data_behavior = "overwrite"
)
toc()
}
)
# Same duplicate ratio, now on the deduplicated corpus (expected: 0).
(read_corpus("./data/ch2_technology/corpus_deduplicated") |> group_by(id) |> summarize(n = n()) |> filter(n > 1) |> collect() |> nrow()) / (corpus_read("./data/ch2_technology/corpus_deduplicated") |> nrow())
Rename the corpora now.
NOW IF EVERYTHING IS OK, DELETE THE OLD CORPUS AND RENAME THE NEW ONE## Export data for sentiment analysis
#|
# Export id / year / abstract to parquet for the external sentiment
# analysis (Python/NLTK); only runs when the file does not exist yet.
if (!file.exists(params$fn_sent_analysis_parquet)) {
corpus_read(params$corpus_dir) |>
dplyr::select(id, publication_year, ab) |>
arrow::write_parquet(params$fn_sent_analysis_parquet)
}technology AND vision in TCA Corpus#|
if (!file.exists(params$fn_random_sample_250)) {
# Reproducible random sample of 250 works exported to Excel for manual
# review; abstracts truncated to 5000 chars (Excel cell limit headroom).
set.seed(14)
read_corpus(params$corpus_dir) |>
dplyr::select(id, doi, author_abbr, display_name, ab) |>
dplyr::rename(abstract = ab, title = display_name) |>
dplyr::collect() |>
dplyr::slice_sample(n = 250) |>
dplyr::mutate(
abstract = substr(abstract, start = 1, stop = 5000)
) |>
writexl::write_xlsx(path = params$fn_random_sample_250)
}## | |
if (!file.exists(params$fn_publications_over_time)) {
  # Per-year publication counts of the TCA corpus, joined with the cached
  # OpenAlex year tables so all series can be plotted together.
  read_corpus(params$corpus_tca_dir) |>
    dplyr::select(publication_year) |>
    dplyr::arrange(publication_year) |>
    dplyr::collect() |>
    table() |>
    as.data.frame() |>
    dplyr::mutate(
      publication_year = as.integer(as.character(publication_year)),
      p = Freq / sum(Freq),
      p_cum = cumsum(p)
    ) |>
    dplyr::rename(
      count = Freq
    ) |>
    # oa complete
    # BUG FIX: the native pipe already supplies the first argument, so the
    # original `dplyr::left_join(x = oa_count$oa_years, ...)` errored with
    # "formal argument 'x' matched by multiple actual arguments".
    # right_join() keeps all OpenAlex years (the original intent of
    # left_join(x = oa_years)); the piped TCA columns get the "_tca"
    # suffix so the unsuffixed OA columns can be renamed to *_oa below.
    dplyr::right_join(
      y = oa_count$oa_years,
      by = "publication_year",
      suffix = c("_tca", "")
    ) |>
    # oa vision
    dplyr::left_join(
      y = oa_count$vision_years,
      by = "publication_year",
      suffix = c("", "_vision")
    ) |>
    # oa technology
    dplyr::left_join(
      y = oa_count$technology_years,
      by = "publication_year",
      suffix = c("", "_technology")
    ) |>
    # oa vision technology
    dplyr::left_join(
      y = oa_count$vision_technology_years,
      by = "publication_year",
      suffix = c("", "_technology_vision")
    ) |>
    dplyr::rename(
      count_oa = count,
      p_oa = p,
      p_cum_oa = p_cum
    ) |>
    saveRDS(file = params$fn_publications_over_time)
}if (length(list.files(path = dirname(params$fig_publications_over_time), pattern = basename(params$fig_publications_over_time))) < 2) {
# Factor used to map cumulative proportions (0..1) onto the count axis;
# the secondary axis reverses the scaling.
sec_axi_fact <- 0.5e-5
figure <- readRDS(params$fn_publications_over_time) |>
dplyr::filter(publication_year >= params$temporal_from) |>
ggplot2::ggplot() +
#
# NOTE(review): "Nmber" is a typo, but the bar label and the
# scale_fill_manual() key below must stay identical — fix both together.
ggplot2::geom_bar(ggplot2::aes(x = publication_year, y = count_tca, fill = "Nmber of publications per year in TCA Corpus"), stat = "identity") +
ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_oa / sec_axi_fact, color = "Cumulative proportion OA Corpus"), size = 1.5) +
ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_tca / sec_axi_fact, color = "Cumulative proportion TCA Corpus"), size = 1.5) +
ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_vision / sec_axi_fact, color = "Cumulative proportion vision only corpus"), size = 1.5) +
ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_technology / sec_axi_fact, color = "Cumulative proportion technology only corpus"), size = 1.5) +
ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_technology_vision / sec_axi_fact, color = "Cumulative proportion Technology Corpus"), size = 1.5) +
#
ggplot2::scale_color_manual(
values = c(
"Cumulative proportion OA Corpus" = "#1f77b4",
"Cumulative proportion TCA Corpus" = "black",
"Cumulative proportion vision only corpus" = "#2ca02c",
"Cumulative proportion technology only corpus" = "#d62728",
"Cumulative proportion Technology Corpus" = "#9467bd"
)
) +
ggplot2::scale_fill_manual(
values = c("Nmber of publications per year in TCA Corpus" = "lightgrey")
) +
#
ggplot2::scale_x_continuous(breaks = seq(params$temporal_from, 2030, 10)) +
ggplot2::scale_y_continuous(
"Proportion of publications",
sec.axis = ggplot2::sec_axis(~ . * sec_axi_fact, name = "Cumulative proportion") # divide by 100 to scale back the secondary axis
) +
ggplot2::labs(
title = "Publications over time",
x = "Year",
y = "Number of publications"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
axis.text.y.right = ggplot2::element_text(color = "red"),
legend.position = "inside", # Move the legend to the top left position
legend.justification = c(0.1, 0.9), # Justify the legend to the top left position
legend.background = ggplot2::element_rect(fill = "white", color = "black") # Add a white background to the legend
)
# Save the figure as both PDF and PNG next to each other.
ggplot2::ggsave(
paste0(params$fig_publications_over_time, ".pdf"),
width = 12,
height = 12,
figure
)
ggplot2::ggsave(
paste0(params$fig_publications_over_time, ".png"),
width = 12,
height = 12,
figure
)
rm(figure, sec_axi_fact)
# Sentiment results: keep only the first row per id (deduplication) and
# cache as RDS for all downstream sentiment chunks.
}read.csv("input/ch2_technology/sent_analysis_technology_results_MD.csv") |>
dplyr::group_by(id) |>
dplyr::slice_min(
order_by = row_number(),
n = 1
) |>
saveRDS(file = params$fn_sentiment_results)#|
if (!file.exists(params$fn_sentiment_spatial_data)) {
# Mean sentiment scores per author country (ISO3), cached as RDS.
# NOTE(review): the `data <-` assignment captures saveRDS()'s invisible
# NULL — the pipeline's real output is the file, not `data`.
data <- corpus_read(params$corpus_authors_dir) |>
dplyr::select(
work_id,
institution_country_code
) |>
dplyr::filter(
!is.na(institution_country_code)
) |>
collect() |>
mutate(
# convert 2-letter to 3-letter country codes for the map helper
iso3c = countrycode::countrycode(
institution_country_code,
origin = "iso2c",
destination = "iso3c"
),
institution_country_code = NULL
) |>
dplyr::left_join(
readRDS(params$fn_sentiment_results) |>
dplyr::select(
work_id = id,
neg,
neu,
pos,
compound
) ,
by = "work_id"
) |>
dplyr::group_by(iso3c) |>
dplyr::summarize(
mean_neg = mean(neg, na.rm = TRUE),
mean_neu = mean(neu, na.rm = TRUE),
mean_pos = mean(pos, na.rm = TRUE),
mean_compound = mean(compound, na.rm = TRUE),
n = n()
) |>
dplyr::arrange(
dplyr::desc(mean_neg)
) |>
# Filter out missing countries - only necessary as analysis not completed yet
dplyr::filter(
!is.nan(mean_neg)
) |>
saveRDS(file = params$fn_sentiment_spatial_data)
}#|
if (length(list.files(path = file.path("maps", "ch2_technology"), pattern = "sentiment_neu_per_countries")) < 4) {
# Neutral-sentiment country maps: one for all countries, one restricted
# to countries with more than min_count_sentiment_timeseries works.
data <- readRDS(params$fn_sentiment_spatial_data)
map <- data |>
map_country_codes(
map_type = "countries",
values = "mean_neu",
geodata_path = params$gdm_dir
) +
ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
ggplot2::ggtitle("Mean neutral sentiment (0 - 1) - all countries")
map_sel <- data |>
dplyr::filter(n > params$min_count_sentiment_timeseries) |>
map_country_codes(
map_type = "countries",
values = "mean_neu",
geodata_path = params$gdm_dir
) +
ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
ggplot2::ggtitle("Mean neutral sentiment (0 - 1) - more than 10 works")
# Save both maps as PDF and PNG (4 files; hence the `< 4` guard above).
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_neu_per_countries_all.pdf"),
width = 12,
height = 8,
map
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_neu_per_countries_all.png"),
width = 12,
height = 8,
map
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_neu_per_countries_10.pdf"),
width = 12,
height = 8,
map_sel
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_neu_per_countries_10.png"),
width = 12,
height = 8,
map_sel
)
rm(map, data)
}#|
if (length(list.files(path = file.path("maps", "ch2_technology"), pattern = "sentiment_pos_per_countries")) < 4) {
# Positive-sentiment country maps (all countries / n > threshold),
# each saved as PDF and PNG.
data <- readRDS(params$fn_sentiment_spatial_data)
map <- data |>
map_country_codes(
map_type = "countries",
values = "mean_pos",
geodata_path = params$gdm_dir
) +
ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
ggplot2::ggtitle("Mean positive sentiment (0 - 1) - all countries")
map_sel <- data |>
dplyr::filter(n > params$min_count_sentiment_timeseries) |>
map_country_codes(
map_type = "countries",
values = "mean_pos",
geodata_path = params$gdm_dir
) +
ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
ggplot2::ggtitle("Mean positive sentiment (0 - 1) - more than 10 works")
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_pos_per_countries_all.pdf"),
width = 12,
height = 8,
map
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_pos_per_countries_all.png"),
width = 12,
height = 8,
map
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_pos_per_countries_10.pdf"),
width = 12,
height = 8,
map_sel
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_pos_per_countries_10.png"),
width = 12,
height = 8,
map_sel
)
rm(map, data)
}#|
if (length(list.files(path = file.path("maps", "ch2_technology"), pattern = "sentiment_neg_per_countries")) < 4) {
  # Negative-sentiment country maps (all countries / n > threshold),
  # each saved as PDF and PNG.
  data <- readRDS(params$fn_sentiment_spatial_data)
  map <- data |>
    map_country_codes(
      map_type = "countries",
      values = "mean_neg",
      geodata_path = params$gdm_dir
    ) +
    ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
    ggplot2::ggtitle("Mean negative sentiment (0 - 1) - all countries")
  map_sel <- data |>
    dplyr::filter(n > params$min_count_sentiment_timeseries) |>
    map_country_codes(
      map_type = "countries",
      # BUG FIX: this map was plotting "mean_pos" although title and file
      # names say negative sentiment (copy-paste from the pos chunk).
      values = "mean_neg",
      geodata_path = params$gdm_dir
    ) +
    ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
    ggplot2::ggtitle("Mean negative sentiment (0 - 1) - more than 10 works")
  ggplot2::ggsave(
    file.path("maps", "ch2_technology", "sentiment_neg_per_countries_all.pdf"),
    width = 12,
    height = 8,
    map
  )
  ggplot2::ggsave(
    file.path("maps", "ch2_technology", "sentiment_neg_per_countries_all.png"),
    width = 12,
    height = 8,
    map
  )
  ggplot2::ggsave(
    file.path("maps", "ch2_technology", "sentiment_neg_per_countries_10.pdf"),
    width = 12,
    height = 8,
    map_sel
  )
  ggplot2::ggsave(
    file.path("maps", "ch2_technology", "sentiment_neg_per_countries_10.png"),
    width = 12,
    height = 8,
    map_sel
  )
  rm(map, data)
}#|
if (length(list.files(path = file.path("maps", "ch2_technology"), pattern = "sentiment_comp_per_countries")) < 4) {
# Compound-sentiment country maps (all countries / n > threshold),
# each saved as PDF and PNG.
data <- readRDS(params$fn_sentiment_spatial_data)
map <- data |>
map_country_codes(
map_type = "countries",
values = "mean_compound",
geodata_path = params$gdm_dir
) +
ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
ggplot2::ggtitle("Mean compound sentiment (-1: negative; 1: positive) - all countries")
map_sel <- data |>
dplyr::filter(n > params$min_count_sentiment_timeseries) |>
map_country_codes(
map_type = "countries",
values = "mean_compound",
geodata_path = params$gdm_dir
) +
ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
ggplot2::ggtitle("Mean compound sentiment (-1: negative; 1: positive) - more than 10 works")
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_comp_per_countries_all.pdf"),
width = 12,
height = 8,
map
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_comp_per_countries_all.png"),
width = 12,
height = 8,
map
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_comp_per_countries_10.pdf"),
width = 12,
height = 8,
map_sel
)
ggplot2::ggsave(
file.path("maps", "ch2_technology", "sentiment_comp_per_countries_10.png"),
width = 12,
height = 8,
map_sel
)
rm(map, data)
}#|
if (!file.exists(params$fn_sentiment_temporal_data)) {
  # Mean sentiment scores per year, cached as RDS.
  # The result is written straight to disk; the previous `data <-`
  # assignment only captured saveRDS()'s invisible NULL and was removed
  # as a misleading dead assignment.
  readRDS(params$fn_sentiment_results) |>
    dplyr::select(
      work_id = id,
      year = date,
      neg,
      neu,
      pos,
      compound
    ) |>
    dplyr::group_by(year) |>
    dplyr::summarize(
      neg = mean(neg),
      neu = mean(neu),
      pos = mean(pos),
      compound = mean(compound),
      n = n()
    ) |>
    saveRDS(file = params$fn_sentiment_temporal_data)
}#|
if (!file.exists(params$fn_sentiment_marine_temporal_data)) {
# Mean sentiment per year, restricted to the marine subset of works.
# NOTE(review): `data <-` captures saveRDS()'s invisible NULL; the real
# output is the cached RDS file.
data <- readRDS(params$fn_sentiment_results) |>
select(
work_id = id,
year = date,
neg,
neu,
pos,
compound
) |>
dplyr::filter(
work_id %in% oa_count$vision_technology_marine_ids
) |>
dplyr::group_by(year) |>
dplyr::summarize(
neg = mean(neg),
neu = mean(neu),
pos = mean(pos),
compound = mean(compound),
n = n()
) |>
saveRDS(file = params$fn_sentiment_marine_temporal_data)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_over_time")) < 2) {
# All four sentiment scores over time (years with enough works only),
# saved as PDF and PNG.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
tidyr::pivot_longer(cols = c(neg, neu, pos, compound), names_to = "type", values_to = "value") |>
ggplot2::ggplot() +
ggplot2::geom_line(aes(x = year, y = value, color = type, linetype = type)) +
ggplot2::scale_color_manual(values = c("black", "red", "blue", "green")) +
ggplot2::labs(
title = paste0("Sentiment Analysis Scores (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Score",
color = "Type",
linetype = "Type"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_over_time.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_over_time.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_over_time_neg_pos")) < 2) {
# Negative and positive scores only, over time; PDF and PNG.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
tidyr::pivot_longer(cols = c(neg, pos), names_to = "type", values_to = "value") |>
ggplot2::ggplot() +
ggplot2::geom_line(aes(x = year, y = value, color = type, linetype = type)) +
ggplot2::scale_color_manual(values = c("black", "red", "blue", "green")) +
ggplot2::labs(
title = paste0("Sentiment Analysis Scores (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Score",
color = "Type",
linetype = "Type"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_over_time_neg_pos.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_over_time_neg_pos.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_marine_over_time_neg_pos")) < 2) {
# Marine subset: negative and positive scores over time; PDF and PNG.
figure <- readRDS(params$fn_sentiment_marine_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
tidyr::pivot_longer(cols = c(neg, pos), names_to = "type", values_to = "value") |>
ggplot2::ggplot() +
ggplot2::geom_line(aes(x = year, y = value, color = type, linetype = type)) +
ggplot2::scale_color_manual(values = c("black", "red", "blue", "green")) +
ggplot2::labs(
title = paste0("Sentiment Analysis Scores (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Score",
color = "Type",
linetype = "Type"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_marine_over_time_neg_pos.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_marine_over_time_neg_pos.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_neg_over_time")) < 2) {
# Negative score only, over time; PDF and PNG.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
ggplot2::ggplot() +
ggplot2::geom_line(ggplot2::aes(x = year, y = neg)) +
ggplot2::labs(
title = paste0("Sentiment Analysis negative Score (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Negative score"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_neg_over_time.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_neg_over_time.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_neu_over_time")) < 2) {
# Neutral score only, over time; PDF and PNG.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
ggplot2::ggplot() +
ggplot2::geom_line(ggplot2::aes(x = year, y = neu)) +
ggplot2::labs(
title = paste0("Sentiment Analysis neutral Score (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Neutral score"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_neu_over_time.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_neu_over_time.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_pos_over_time")) < 2) {
# Positive score only, over time; PDF and PNG.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
ggplot2::ggplot() +
ggplot2::geom_line(ggplot2::aes(x = year, y = pos)) +
ggplot2::labs(
title = paste0("Sentiment Analysis positive Score (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Positive score"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_pos_over_time.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_pos_over_time.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_comp_over_time")) < 2) {
# Compound score only, over time; PDF and PNG.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
ggplot2::ggplot() +
ggplot2::geom_line(ggplot2::aes(x = year, y = compound)) +
ggplot2::labs(
title = paste0("Sentiment Analysis Compound Score (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Compound score"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_comp_over_time.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_comp_over_time.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_marine_over_time_neg_pos_ridge")) < 2) {
# Ridge-density plot of per-work pos/neg scores by year (from 1980 on).
# NOTE(review): geom_density_ridges() needs the ggridges package, and the
# unqualified ggplot()/mutate()/%>% calls assume tidyverse attachment —
# confirm these are loaded in the setup chunk (not visible here).
figure <- readRDS(params$fn_sentiment_results) |>
select(
work_id = id,
year = date,
neg,
neu,
pos,
compound
) |>
dplyr::ungroup() |>
dplyr::filter(
year >= 1980
) |>
mutate(
year = as.factor(year)
) |>
ggplot() +
geom_density_ridges(aes(x = pos, y = year, fill = "positive"), rel_min_height = 0.005) +
geom_density_ridges(aes(x = -neg, y = year, fill = "negative"), rel_min_height = 0.005)
# Calculate the number of points per year
counts <- figure$data %>%
group_by(year) %>%
summarise(
n = n()
)
# Add the number of points per year as text
figure <- figure +
geom_text(data = counts, aes(x = Inf, y = year, label = n), hjust = "inward", vjust = 0.5)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_marine_over_time_neg_pos_ridge.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_marine_over_time_neg_pos_ridge.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (length(list.files(path = file.path("figures", "ch2_technology"), pattern = "sentiments_over_time_neg_pos")) < 2) {
# NOTE(review): this chunk duplicates the earlier sentiments_over_time_neg_pos
# chunk and writes the same output files; because the file-existence guard
# matches the files written there, it normally never runs. Candidate for
# removal.
figure <- readRDS(params$fn_sentiment_temporal_data) |>
dplyr::filter(
n > params$min_count_sentiment_timeseries
) |>
tidyr::pivot_longer(cols = c(neg, pos), names_to = "type", values_to = "value") |>
ggplot2::ggplot() +
ggplot2::geom_line(aes(x = year, y = value, color = type, linetype = type)) +
ggplot2::scale_color_manual(values = c("black", "red", "blue", "green")) +
ggplot2::labs(
title = paste0("Sentiment Analysis Scores (n > ", params$min_count_sentiment_timeseries, ")"),
x = "Year",
y = "Score",
color = "Type",
linetype = "Type"
) +
ggplot2::theme_minimal()
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_over_time_neg_pos.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "ch2_technology", "sentiments_over_time_neg_pos.png"),
width = 12,
height = 6,
figure
)
rm(figure)
}#|
if (!file.exists(params$fn_marine_sentiment_all)) {
# Excel export of marine works joined with their sentiment scores.
# NOTE(review): despite the "_all" file name this still applies
# slice_sample(n = 250) and an unseeded sample — confirm whether the
# "all" variant should export every marine work instead.
corpus_read(params$corpus_dir) |>
dplyr::filter(
id %in% oa_count$vision_technology_marine_ids
) |>
dplyr::select(id, doi, author_abbr, display_name, ab) |>
dplyr::rename(abstract = ab, title = display_name) |>
dplyr::collect() |>
dplyr::slice_sample(n = 250) |>
dplyr::mutate(
abstract = substr(abstract, start = 1, stop = 5000)
) |>
dplyr::left_join(
y = readRDS(params$fn_sentiment_results) |>
dplyr::select(
id,
neg,
neu,
pos,
compound
),
by = "id"
) |>
writexl::write_xlsx(path = params$fn_marine_sentiment_all)
}
if (!file.exists(params$fn_marine_sentiment)) {
# Excel export of a 250-work random sample of marine works joined with
# their sentiment scores (sample is unseeded, so not reproducible).
corpus_read(params$corpus_dir) |>
dplyr::filter(
id %in% oa_count$vision_technology_marine_ids
) |>
dplyr::select(id, doi, author_abbr, display_name, ab) |>
dplyr::rename(abstract = ab, title = display_name) |>
dplyr::collect() |>
dplyr::slice_sample(n = 250) |>
dplyr::mutate(
abstract = substr(abstract, start = 1, stop = 5000)
) |>
dplyr::left_join(
y = readRDS(params$fn_sentiment_results) |>
dplyr::select(
id,
neg,
neu,
pos,
compound
),
by = "id"
) |>
writexl::write_xlsx(path = params$fn_marine_sentiment)
}The results are based on data downloaded or accessed at:
# Unique modification dates of the downloaded page files — reported as the
# date(s) the data was accessed from OpenAlex.
c_time <- list.files(
path = params$pages_dir,
recursive = TRUE,
pattern = ".rds$",
full.names = TRUE
) |>
file.mtime() |>
as.Date() |>
unique()./data/ch2_technology/corpus downloaded at 2024-03-27 from OpenAlexvision AND technology in TCA CorpusFor the TCA Corpus, we do have 490,201 number of works.
An Excel file containing a random sample of 250 works from the Technology Corpus (technology AND vision AND nature AND transformative change) with the fields id, doi, author_abbr and abstract of the papers. The Excel file can be downloaded from here.
The subfields are based on the main topic assigned to each work. There are other topics also assigned, but this one has been identified as the main topic by an algorithm. count is the number of works in the vision AND technology corpus which have been assigned to the subfield.
Please take a look at these subfields of the topics to identify the ones to be filtered out.
The easiest would be to download the Excel file through the button and to mark the subfields to be filtered out.
# Interactive table of the subfields, sorted by descending count, for the
# manual filtering decision described above.
IPBES.R::table_dt((oa_count$vision_technology_subfields |> dplyr::arrange(desc(count))), fixedColumns = NULL, fn = "Vision Technology Subfields")A pdf of the graph can be downloaded here.
Two .parquet files containing the id, publication_year and ab (abstract) were extracted and are available upon request due to their size.
For analyzing the sentiments of the provided abstracts, we have used the Python NLTK package, and VADER (Valence Aware Dictionary and sEntiment Reasoner), which is an NLTK module that provides sentiment scores based on the words used. VADER is a pre-trained, rule-based sentiment analysis model in which the terms are generally labeled as per their semantic orientation as either positive or negative.
The main advantage/reason for using this model was that it doesn’t require a labeled training dataset. The output of the model is 4 statistical scores:
#|
# Render the full sentiment results as an interactive (client-side)
# DataTable; the rendered warning below suggests switching to server-side
# processing for a table this large.
readRDS(params$fn_sentiment_results) |>
IPBES.R::table_dt(fn = "sentiment_scores", fixedColumns = list(leftColumns = 2))Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html